This file contains time-series plots from the SoS database.
First, we load the SoS database and process the date columns.
# knitr::opts_chunk$set(message = FALSE)

# plyr is attached before dplyr so that dplyr's verbs (mutate, summarize, ...)
# mask plyr's same-named functions rather than the other way round.
library(plyr)
library(dplyr)
library(magrittr)
library(lubridate)
library(ggplot2)
library(devtools)

# Load the package under development so its data and cleaning helpers are
# available.
load_all()
# source("inst/scripts/1-load-and-clean.R", verbose = TRUE)

data(sos_raw)

# One synthetic identifier per row of sos_raw. seq_len() keeps this correct
# (an empty sequence) if the table ever has zero rows, where 1:nrow() would
# yield c(1, 0).
sosid <- paste0("SOS", seq_len(nrow(sos_raw)))

# Creation and termination dates, as returned by the package's cleaning
# helpers.
date_created <- clean_date_created(sos_raw)
date_terminated <- clean_date_terminated(sos_raw)
Next, we create a data frame and use the lubridate package to create an interval object for each system, running from its creation date to its termination date.
# One row per system: its creation date, its termination date, and the
# lubridate interval spanning the two.
sos_dates <- data.frame(date_created, date_terminated)

# interval() replaces new_interval(), which lubridate has deprecated.
sos_dates$active_interval <- interval(
  sos_dates$date_created,
  sos_dates$date_terminated
)
We then create a separate data frame with one row for every year from 1900 to 2015. For each year, a second column holds the number of surveillance systems whose active interval encompasses that year.
I wrote a short function that takes as its arguments a year and a vector of lubridate intervals, and returns the number of those intervals that contain the year.
#' Count the intervals that contain a given point in time
#'
#' @param year A single date-time to test.
#' @param intervals A vector of lubridate intervals.
#' @return The number of elements of `intervals` for which
#'   `year %within% intervals` is `TRUE`. Comparisons that evaluate to `NA`
#'   are dropped rather than counted.
sum_active_systems_for_year <- function(year, intervals) {
  # NOTE(review): because of na.rm = TRUE, an interval with a missing endpoint
  # presumably never counts as active -- confirm that is the intended handling
  # of systems with no recorded termination date.
  sum(year %within% intervals, na.rm = TRUE)
}

# One row per year, each represented by the date-time parsed from the bare
# year number.
time_series <- data.frame(year = parse_date_time(1900:2015, orders = "y"))

# vapply() instead of sapply(): the result is guaranteed to be an integer
# vector with one count per year, whatever the input looks like.
time_series$number_active <- vapply(
  time_series$year,
  sum_active_systems_for_year,
  FUN.VALUE = integer(1),
  intervals = sos_dates$active_interval
)

#
# We could also run this:
# time_series %>%
#   group_by(year) %>%
#   mutate(number_active = sum(year %within% sos_dates$active_interval, na.rm = TRUE)) %>%
#   ungroup()
We can then plot this.
# Quick-and-dirty line plot with default theme and axis labels.
# (Written with ggplot() directly; qplot() is deprecated in current ggplot2.)
ggplot(time_series, aes(x = year, y = number_active)) +
  geom_line()

# Shared starting point for the two polished versions below.
active_systems_plot <- ggplot(time_series, aes(x = year, y = number_active)) +
  geom_line() +
  theme_bw()

# Cleaner plot on a linear scale.
active_systems_plot +
  labs(
    x = "Year",
    y = "Count of Active Systems",
    title = "Number of Active Surveillance Systems over Time"
  )

# With number of systems on a log scale.
active_systems_plot +
  labs(
    x = "Year",
    y = "Count of Active Systems (log scale)",
    title = "Number of Active Surveillance Systems over Time"
  ) +
  scale_y_log10()
Next, we split and color the plot by entity type.
This is tedious, because we have to iterate over both the levels of entity_type and the values of year.
# Attach each system's entity type (as a factor) to the date table.
sos_dates$entity_type <- clean_entity_type(sos_raw, return_type = "factor")

# Build every (year, entity type) combination, then count, for each
# combination, the systems of that entity type whose active interval contains
# the year. Grouping by both columns makes each group a single row, so `year`
# and `entity_type` are scalars inside mutate().
time_series_entity <- expand.grid(
  year = time_series$year,
  entity_type = levels(sos_dates$entity_type)
) %>%
  group_by(entity_type, year) %>%
  mutate(
    number_active = sum_active_systems_for_year(
      year,
      sos_dates[sos_dates$entity_type == entity_type, "active_interval"]
    )
  ) %>%
  ungroup()
# Every entity-type plot shares the same data, aesthetics and theme; only the
# geom, scale and labels differ.
# NOTE(review): the `order` aesthetic is only honoured by old ggplot2 releases;
# newer versions appear to ignore it -- confirm the stacking order if this is
# re-run on a current installation.
entity_plot_base <- ggplot(
  time_series_entity,
  aes(
    x = year,
    y = number_active,
    fill = entity_type,
    color = entity_type,
    order = desc(entity_type)
  )
) +
  theme_bw()

# Stacked area: band heights add up to the total count.
entity_plot_base +
  geom_area() +
  labs(
    x = "Year",
    y = "Count of Active Systems",
    title = "Number of Active Surveillance Systems over Time by Entity Type (area)"
  )

# Proportional area: position = "fill" rescales each year to sum to 1, so the
# y axis is a share rather than a count.
entity_plot_base +
  geom_area(position = "fill") +
  labs(
    x = "Year",
    y = "Share of Active Systems",
    title = "Share of Active Surveillance Systems over Time by Entity Type (proportional area)"
  )

# Overlaid (unstacked) translucent areas.
entity_plot_base +
  geom_area(position = "identity", alpha = 0.25) +
  labs(
    x = "Year",
    y = "Count of Active Systems",
    title = "Number of Active Surveillance Systems over Time by Entity Type"
  )

# Overlaid translucent areas on a log scale.
entity_plot_base +
  geom_area(position = "identity", alpha = 0.25) +
  scale_y_log10() +
  labs(
    x = "Year",
    y = "Count of Active Systems (log scale)",
    title = "Number of Active Surveillance Systems over Time by Entity Type"
  )

# One line per entity type.
entity_plot_base +
  geom_line() +
  labs(
    x = "Year",
    y = "Count of Active Systems",
    title = "Number of Active Surveillance Systems over Time by Entity Type (line)"
  )

# One line per entity type on a log scale.
entity_plot_base +
  geom_line() +
  scale_y_log10() +
  labs(
    x = "Year",
    y = "Count of Active Systems (log-transformed)",
    title = "Number of Active Surveillance Systems over Time by Entity Type (line)"
  )
library(reshape2)

# Quick histograms of creation and termination years, one bin per year.
# (Written with ggplot() directly; qplot() is deprecated in current ggplot2.)
ggplot(sos_dates, aes(x = year(date_created))) +
  geom_histogram(binwidth = 1)
ggplot(sos_dates, aes(x = year(date_terminated))) +
  geom_histogram(binwidth = 1)

# Long format: one row per (system, date kind), reduced to the calendar year
# and restricted to 1950-2014.
date_hist <- data.frame(
  sosid,
  select(sos_dates, date_created, date_terminated)
) %>%
  melt(id.vars = "sosid") %>%
  mutate(year = year(value)) %>%
  filter(year != 2015, year >= 1950)

# Mirrored histogram: creations above the axis, terminations below it.
# `..count..` is ggplot2's legacy spelling of the computed bin count.
created_terminated_hist <- ggplot() +
  geom_histogram(
    data = filter(date_hist, variable == "date_created"),
    mapping = aes(x = year, y = ..count.., fill = "Created"),
    binwidth = 1
  ) +
  geom_histogram(
    data = filter(date_hist, variable == "date_terminated"),
    mapping = aes(x = year, y = -..count.., fill = "Terminated"),
    binwidth = 1
  ) +
  scale_fill_hue("Group") +
  theme_bw() +
  labs(
    x = "Year",
    y = "Number of Surveillance Systems",
    title = "Number of Surveillance Systems Created and Terminated"
  )
created_terminated_hist
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated.pdf", width = 6.5, height = 4.5)
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated.png", width = 6.5, height = 4.5)

# Wide format: one row per year with the number created and terminated.
# The grouping is dropped and the result converted to a plain data frame
# before reshaping, and value.var is named explicitly so dcast() does not
# have to guess which column holds the counts.
date_hist2 <- date_hist %>%
  group_by(year, variable) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  as.data.frame() %>%
  dcast(year ~ variable, value.var = "count")

# Add rows for years in which nothing was created or terminated. The guard
# matters: data.frame(year = <length 0>, date_created = NA, ...) is an error,
# so padding must be skipped when every year is already present.
years_not_in_df <- setdiff(1950:2014, date_hist2$year)
if (length(years_not_in_df) > 0) {
  date_hist2 <- rbind(
    date_hist2,
    data.frame(
      year = years_not_in_df,
      date_created = NA,
      date_terminated = NA
    )
  )
}

# Missing counts (padded years, or a year with only one kind of event) are
# zeros.
date_hist2[is.na(date_hist2)] <- 0

# Bars for created (up) and terminated (down), with a line for the net change.
ggplot(data = date_hist2, aes(x = year)) +
  geom_bar(aes(y = date_created), stat = "identity", fill = "#F8766D") +
  geom_bar(aes(y = -date_terminated), stat = "identity", fill = "#00BFC4") +
  geom_line(aes(y = date_created - date_terminated), size = 0.5) +
  theme_bw() +
  labs(
    x = "Year",
    y = "Number of Surveillance Systems",
    title = "Number of Surveillance Systems Created and Terminated"
  )
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated (with over-under line).pdf", width = 6.5, height = 4.5)
# ggsave(file = "inst/out/Number of Surveillance Systems Created and Terminated (with over-under line).png", width = 6.5, height = 4.5)

# Mirrored histogram again, with the running count of active systems drawn on
# top.
created_terminated_hist +
  geom_line(
    data = time_series,
    mapping = aes(x = year(year), y = number_active)
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.